Embryo measurements | Schematic representation of the measurements taken
# Load libraries
library(tidyverse)
library(here)
library(ggridges) # for ridge plot visualization
library(reshape2) # to use melt()
library(patchwork) # to combine plots with wrap_plots()
#
## Import raw data
#
raw_embryo_length <-
readr::read_delim(
file = here::here("chick_elong_overlap/data_embryo_elongation/input/ChickElongQuant_input.csv"),
delim = ",",
na = "",
escape_double = FALSE,
trim_ws = TRUE,
locale = locale(decimal_mark = ".") # Important | To deal with the fact that in Portugal, sometimes, the decimals are commas.
)
#
## Format the dataset
#
# Vector with the HH stages ordered
hh_levels_order <- c("HH4","HH5","HH6","HH7","HH8","HH9","HH10")
nr_of_embryos <- length(unique(raw_embryo_length$Embryo))
# Wide format
clean_embryo_length <-
raw_embryo_length |>
dplyr::select(-NumberMeasuredFrames, -CultureType) |>
dplyr::filter(!is.na(HHStage)) |>
dplyr::mutate(HHStage = paste0("HH", HHStage),
HHStage = factor(HHStage, levels = hh_levels_order),
Embryo = factor(Embryo, levels = 1:nr_of_embryos)) |>
dplyr::relocate(c(Embryo, HHStage), .before = Time)
# Pivot the data to long format for plotting
clean_embryo_length_longer <-
clean_embryo_length |>
tidyr::pivot_longer(
cols = !c(Embryo, HHStage, Time),
names_to = "Measure",
values_to = "Length_mm"
)
# Measurements description (for plots)
measurements <- tibble::tibble(
measure = c(
"1.Total embryo length (mm)",
"2.Total PS length (mm)",
"3.Notochord length (mm)",
"4.Posterior length (mm)",
"5.PS length (mm)",
"6.PSM length (mm)",
"7.Anterior length (mm)",
"8.Segmented length (mm)",
"9.Head fold length (mm)"
),
acronym = c(
"C-pPL",
"C-PS",
"C-N",
"N-pPL",
"N-PS",
"PSM",
"C-Seg",
"SEG",
"C-HF"
),
my_labs = c(
"C-pPL | Total embryo length",
"C-PS | Total Primitive Streak length",
"C-N | Notochord length",
"N-pPL | Posterior region length",
"N-PS | Primitive Streak length",
"PSM | Presomitic Mesoderm length",
"C-Seg | Anterior region length",
"SEG | Segmented region length",
"C-HF | Head fold length"
)
)
## Summary stats
summary(clean_embryo_length)
#> Embryo HHStage Time C-pPL C-PS
#> 19 : 30 HH4 :43 Min. :-7.600 Min. :1.630 Min. :1.530
#> 5 : 28 HH5 :99 1st Qu.: 3.362 1st Qu.:3.618 1st Qu.:3.160
#> 3 : 25 HH6 :50 Median : 9.150 Median :4.715 Median :4.140
#> 4 : 25 HH7 :47 Mean : 9.188 Mean :4.585 Mean :4.024
#> 8 : 25 HH8 :71 3rd Qu.:15.090 3rd Qu.:5.572 3rd Qu.:4.878
#> 11 : 24 HH9 :57 Max. :24.150 Max. :7.050 Max. :6.400
#> (Other):233 HH10:23 NA's :2 NA's :4
#> C-N N-pPL N-PS PSM
#> Min. :0.080 Min. :1.020 Min. :0.870 Min. :0.3000
#> 1st Qu.:1.185 1st Qu.:2.110 1st Qu.:1.520 1st Qu.:0.6400
#> Median :2.380 Median :2.450 Median :1.890 Median :0.8100
#> Mean :2.318 Mean :2.518 Mean :1.915 Mean :0.8443
#> 3rd Qu.:3.380 3rd Qu.:2.910 3rd Qu.:2.225 3rd Qu.:1.0200
#> Max. :5.000 Max. :3.860 Max. :3.110 Max. :1.5900
#> NA's :43 NA's :45 NA's :47 NA's :192
#> C-Seg SEG C-HF
#> Min. :1.180 Min. :0.0800 Min. :0.0600
#> 1st Qu.:1.817 1st Qu.:0.3600 1st Qu.:0.4150
#> Median :1.933 Median :0.6550 Median :0.8700
#> Mean :1.944 Mean :0.6816 Mean :0.8743
#> 3rd Qu.:2.070 3rd Qu.:1.0100 3rd Qu.:1.2650
#> Max. :2.310 Max. :1.7300 Max. :1.9000
#> NA's :214 NA's :192 NA's :155
## Look at the Number of embryos per HH stage
clean_embryo_length |>
# Reverse the order of the stacks (from bottom to top)
dplyr::mutate(HHStage = factor(HHStage, levels = rev(unique(HHStage)))) |>
dplyr::count(Embryo, HHStage) |>
ggplot(aes(x = Embryo, y = n, fill = HHStage)) +
geom_col() +
ylab("Number of measures taken per HH Stage") +
ggtitle("Number of Measures per HHStage in Each Embryo") +
geom_text(aes(label = n), position = position_stack(vjust = 0.5),
color = "black", size = 3.5) +
theme_minimal()
## Ridge plots | Per Time
plot_time <-
clean_embryo_length_longer |>
ggplot(mapping = aes(x = Time, y =HHStage, fill = HHStage)) +
geom_density_ridges(
alpha = 0.75,
rel_min_height = 0.001,
scale = 1,
quantile_lines = TRUE,
quantiles = 2,
jittered_points = TRUE,
position = "points_sina",
point_alpha = 0.6,
point_size = 0.5
) +
theme_ridges(grid = TRUE, center_axis_labels = TRUE) +
theme(legend.position = "none") +
#xlim(c(0, 7.1)) +
#scale_x_continuous(breaks = seq(0, 8, 1)) +
xlab("Time (min)") +
ylab('HH stage') +
ggtitle("Embryo time distribution per HH stage")
print(plot_time)
# Ridge plots | Faceted per measure
ggplot(clean_embryo_length_longer, aes(x = Length_mm, y = HHStage, fill = HHStage)) +
ggridges::geom_density_ridges(
alpha = 0.75,
rel_min_height = 0.001,
scale = 2,
quantile_lines = TRUE,
quantiles = 2,
jittered_points = TRUE,
position = "points_sina",
point_alpha = 0.8,
point_size = 0.5
) +
theme_ridges(grid = TRUE, center_axis_labels = TRUE) +
facet_wrap(~Measure, scales = "free_x") +
theme(legend.position = "none") +
labs(
title = "Embryo Measurement Distributions per HH Stage",
x = "Measurement Value",
y = "HH Stage"
)
# Ridge plots | Individual plots per measure
for (my_measure in measurements$acronym) {
plot_data <- clean_embryo_length_longer |>
dplyr::filter(Measure == my_measure) |>
drop_na(Length_mm)
p <- ggplot(plot_data, aes(x = Length_mm, y = HHStage, fill = HHStage)) +
geom_density_ridges(
alpha = 0.75,
rel_min_height = 0.001,
scale = 2,
quantile_lines = TRUE,
quantiles = 2,
jittered_points = TRUE,
position = "points_sina",
point_alpha = 0.8
) +
scale_fill_manual(values = scales::hue_pal()(7), limits = levels(clean_embryo_length_longer$HHStage)) +
theme_ridges(grid = TRUE, center_axis_labels = TRUE) +
theme(legend.position = "none") +
labs(
title = paste("Distribution of", my_measure, "by HH Stage"),
x = paste(my_measure, "(mm)"),
y = "HH Stage"
)
print(p)
}
Goal: Calculate the pairwise overlap between measurement distributions.
Overview of the approach
The idea is to calculate the integral of the minimum of the two density functions over the range where both have nonzero density. This provides a straightforward measure of similarity between two distributions, and therefore an intuitive way to quantify the overlap between the ridge plots (or kernel density estimates).
Simple Overlap Calculation
Mathematically:
Overlap = ∫ min(f1(x), f2(x)) dx
where f1(x) and f2(x) are the density functions of the two distributions.
Why This Works
# Function to compute overlap between two distributions
calculate_overlap <- function(x1, x2, bw = "SJ", n = 512) {
d1 <- density(x1, bw = bw, n = n)
d2 <- density(x2, bw = bw, n = n)
# Find common range where densities are nonzero
lower_bound <- max(min(d1$x), min(d2$x))
upper_bound <- min(max(d1$x), max(d2$x))
# If there is no overlap, return 0
if (lower_bound >= upper_bound) return(0)
# Define the function for the minimum of two densities
min_density_function <- function(x) {
f1 <- approx(d1$x, d1$y, xout = x, rule = 2)$y
f2 <- approx(d2$x, d2$y, xout = x, rule = 2)$y
pmin(f1, f2)
}
# Compute the integral over the overlapping range
overlap_value <- integrate(min_density_function, lower = lower_bound, upper = upper_bound)$value
return(overlap_value)
}
# Function to compute pairwise overlaps and generate plots
calculate_pairwise_overlap <- function(data_list, my_name, bw = "SJ", n = 512) {
names_list <- str_sort(names(data_list), numeric = TRUE)
if (is.null(names_list)) names_list <- paste0("D", seq_along(data_list))
n_datasets <- length(data_list)
overlap_matrix <- matrix(0, nrow = n_datasets, ncol = n_datasets, dimnames = list(names_list, names_list))
for (i in seq_len(n_datasets)) {
for (j in seq_len(n_datasets)) {
if (i <= j) {
overlap_matrix[i, j] <- calculate_overlap(data_list[[i]], data_list[[j]], bw = bw, n = n)
overlap_matrix[j, i] <- overlap_matrix[i, j] # Symmetric matrix
}
}
}
#
# Plots
#
# Convert matrix to long format for ggplot
overlap_df <- reshape2::melt(overlap_matrix)
colnames(overlap_df) <- c("Distribution1", "Distribution2", "Overlap")
# Heatmap plot
heatmap_plot <-
overlap_df |>
filter(as.integer(Distribution2) <= as.integer(Distribution1)) |>
ggplot(aes(x = Distribution1, y = Distribution2, fill = Overlap)) +
geom_tile() +
geom_text(aes(label = round(Overlap, 3)), color = "grey25", size = 5) +
scale_fill_gradient(low = "white", high = "salmon") +
theme_minimal() +
labs(title = paste(my_name, "| Density Overlap"), fill = "Overlap") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert list to long format for ridge plot
ridge_df <- do.call(rbind, lapply(names_list, function(name) {
data.frame(Value = data_list[[name]], Distribution = name)
}))
# Convert Distribution to a factor with numerically ordered levels
ridge_df$Distribution <- factor(ridge_df$Distribution, levels = names_list)
# Ridge plot
ridge_plot <- ggplot(ridge_df, aes(x = Value, y = Distribution, fill = Distribution)) +
geom_density_ridges(alpha=0.75, rel_min_height = 0.001, scale = 1,
quantile_lines = TRUE, quantiles=2,
jittered_points = TRUE, position = "points_sina",
point_alpha = 0.7, point_size = 1, point_shape = 20) +
scale_fill_manual(values = scales::hue_pal()(7), limits = levels(clean_embryo_length_longer$HHStage)) +
theme_minimal() +
#scale_fill_brewer(palette = "Set3") +
labs(title = measurements[which(measurements$acronym == my_name), ]$my_labs,
x = "Length (mm)",
y = "HH Stage") +
theme(legend.position = "none")
# Return both plots as a list
# return(list(ridge_plot = ridge_plot, heatmap = heatmap_plot, overlap_matrix = overlap_matrix))
return(list(ridge_plot = ridge_plot, heatmap = heatmap_plot))
}
# Format the input data as list
clean_embryo_length_list <- clean_embryo_length |>
select(-Time, -Embryo) |>
pivot_longer(-HHStage, names_to = "Med", values_to = "Length") |>
drop_na() |>
group_by(Med, HHStage) |>
summarise(Length = list(Length), .groups = "drop") |>
nest(Length = c(HHStage, Length)) |>
mutate(Length = map(Length, ~set_names(.x$Length, .x$HHStage))) |>
deframe()
# Calculate the distribution overlap for all measurements
overlap_plots <- setNames(
lapply(names(clean_embryo_length_list), function(name) {
calculate_pairwise_overlap(clean_embryo_length_list[[name]], name)
}),
names(clean_embryo_length_list)
)
# Print the plots
combined_plots <- lapply(overlap_plots, function(sublist) {
patchwork::wrap_plots(sublist, ncol = 2) # Combines the plots at this level
})
print(combined_plots)
#> $`C-HF`
#>
#> $`C-N`
#>
#> $`C-PS`
#>
#> $`C-Seg`
#>
#> $`C-pPL`
#>
#> $`N-PS`
#>
#> $`N-pPL`
#>
#> $PSM
#>
#> $SEG
### Save the RData file ## Uncomment if needed
# save.image(file = here::here("chick_elong_overlap/data_embryo_elongation/embryo_elongation_analysis.RData"))
# Session info
sessionInfo()
#> R version 4.5.0 (2025-04-11)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Europe/Lisbon
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] patchwork_1.3.0 reshape2_1.4.4 ggridges_0.5.6 here_1.0.1
#> [5] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
#> [9] purrr_1.0.4 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
#> [13] ggplot2_3.5.2 tidyverse_2.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] sass_0.4.10 generics_0.1.4 stringi_1.8.7 hms_1.1.3
#> [5] digest_0.6.37 magrittr_2.0.3 evaluate_1.0.3 grid_4.5.0
#> [9] timechange_0.3.0 RColorBrewer_1.1-3 fastmap_1.2.0 rprojroot_2.0.4
#> [13] plyr_1.8.9 jsonlite_2.0.0 scales_1.4.0 jquerylib_0.1.4
#> [17] cli_3.6.5 rlang_1.1.6 crayon_1.5.3 bit64_4.6.0-1
#> [21] withr_3.0.2 cachem_1.1.0 yaml_2.3.10 tools_4.5.0
#> [25] parallel_4.5.0 tzdb_0.5.0 vctrs_0.6.5 R6_2.6.1
#> [29] lifecycle_1.0.4 bit_4.6.0 vroom_1.6.5 pkgconfig_2.0.3
#> [33] pillar_1.10.2 bslib_0.9.0 gtable_0.3.6 glue_1.8.0
#> [37] Rcpp_1.0.14 xfun_0.52 tidyselect_1.2.1 rstudioapi_0.17.1
#> [41] knitr_1.50 farver_2.1.2 htmltools_0.5.8.1 labeling_0.4.3
#> [45] rmarkdown_2.29 compiler_4.5.0